#
# mergeCrossSectionalItems.R
#
# Merge survey items from the cross-sectional ANES
# files not included in the cumulative ANES file to
# the cumulative file.
#
# Hill, Seth J. and Chris Tausanovitch. "A Disconnect in Representation? Comparison of Trends in Congressional and Public Polarization."
#
#
# Procedure documentation:
# 1. I manually perused the codebooks for each cross-sectional survey from 1952 to 1966 for ideological items.
# 2. I manually matched items across years that were identical or close to identical.
# 3. I manually matched response categories for the cross-year items. Note the problem with the school construction item in 1962 I mention above.
# 4. I dropped foreign policy questions, even though they provide glue. I kept a question about firing communists without strong evidence from 1956, as that seemed sufficiently domestic. 'THE GOVERNMENT OUGHT TO FIRE ANY GOVERNMENT WORKER WHO IS ACCUSED OF BEING A COMMUNIST EVEN THOUGH THEY DON'T PROVE IT.' [I have no problem if you want to exclude.]
# 5. I coded all items to common response categories, and recoded NA/DK/Refused responses to codes 7 and above.
# 6. I stacked the cross-sectional variables, and merged them to the cumulative file on respondent case id and year.
# 7. I checked that the merge was successful by comparing the age variable from each file. It looked spot on.
# 8. 281 cases from the 1964 cross-sectional file did not match to the cumulative. They have case id numbers all greater than the max case id in the cumulative, so there is likely some systematic explanation. Maybe a Civil Rights era over-sample or something? I did not investigate further.
#

library(foreign)
myFrameStacker <- function(d1,d2,missingVal=NA) {
  # Stack two data frames row-wise, keeping the union of their columns.
  #
  # Arguments:
  # d1 -- data frame 1 (or NULL).
  # d2 -- data frame 2 (or NULL).
  # missingVal -- value to enter for rows whose source frame lacks the column.
  #
  # Returns:
  # A data frame with nrow = nrow(d1) + nrow(d2) and
  # ncol = length(union(names(d1),names(d2))), columns ordered as in that
  # union. If either argument is NULL the other one is returned unchanged,
  # so a stack can be accumulated starting from NULL.
  #
  # Columns are joined with c(), so c()'s coercion rules apply when the two
  # frames hold the same column under different types.
  if (is.null(d1)) return(d2)
  if (is.null(d2)) return(d1)
  rows1 <- nrow(d1)
  rows2 <- nrow(d2)

  cols1 <- as.list(d1)
  cols2 <- as.list(d2)
  all.names <- union(names(cols1), names(cols2))

  out <- lapply(all.names, function(v) {
    # Use the real column where the frame has it, otherwise a filler block
    # of the right length.
    top <- if (v %in% names(cols1)) cols1[[v]] else rep(missingVal, rows1)
    bottom <- if (v %in% names(cols2)) cols2[[v]] else rep(missingVal, rows2)
    c(top, bottom)
  })
  names(out) <- all.names

  as.data.frame(out, stringsAsFactors = FALSE)
}

# Variables used for merging and for checking the merge.
# Age variable number in each cross-sectional study, one entry per study.
cs.age <- c("520142", "540025", "560295", "580174",
            "600122", "620009", "640187", "660192")
# Respondent id variable per study: "V" + two-digit year + "0002", with the
# years running 52, 54, ... in step with the entries of cs.age.
cs.ids <- sprintf("V%s0002", 52 + 2 * (seq_along(cs.age) - 1))

# Cross-sectional items.
# Read the item map and keep only the rows flagged as not being in the
# cumulative file (InCumulativeFile == 0).
cs.items <- read.csv(file = "crossSecVarMap.csv")
cs.items <- cs.items[cs.items$InCumulativeFile == 0, ]
# Prefix both variable-number columns with "V".
for (id.col in c("VariableNumber", "MapsTo")) {
  cs.items[[id.col]] <- sprintf("V%s", cs.items[[id.col]])
}

# Call in data.
# Read every study year named in the item map and keep, per study, only the
# merge keys (year, case id), the age check variable, and the mapped items.
# The result is a list of data frames named by two-digit study year.
age.candidates <- sprintf("V%s",cs.age)
cs.data <- list()
for (study in unique(cs.items$StudyYear)) {
  cat("Calling in election study from 19",study,"...\n",sep="")
  tmp <- read.dta(file=sprintf("NES19%s.dta",study),convert.factors=TRUE)
  # Exactly one of the known age variables and one of the known id variables
  # must be present; anything else would attach a malformed column, so stop.
  age.var <- intersect(names(tmp),age.candidates)
  id.var <- intersect(names(tmp),cs.ids)
  if (length(age.var) != 1 || length(id.var) != 1) {
    stop("Expected exactly one age and one id variable in study 19",study,
         "; found ",length(age.var)," and ",length(id.var),call.=FALSE)
  }
  # Rename age variable (kept under a .check name to verify the merge later).
  tmp$VCF0102.check <- tmp[[age.var]]
  # Rename id variable.
  tmp$VCF0006 <- tmp[[id.var]]
  # Create year variable.
  tmp$VCF0004 <- 1900+study
  keep.vars <- intersect(names(tmp),c("VCF0004","VCF0006","VCF0102.check",cs.items$VariableNumber))
  # collapse (not sep) is what joins the elements of a single vector.
  cat(" Keeping variables",paste(keep.vars,collapse=","),"\n");flush.console()
  cs.data[[paste(study)]] <- tmp[,keep.vars]
}

#
cat("\n=====\nRecoding to common responses...\n=======\n")
#
# Manual set of recodes.
# One slot per study year (named by two-digit year); each filled slot maps a
# variable number of that year to the recode specification applied to it.
# Years that are never assigned below keep a NULL slot.
recoder <- setNames(vector("list", length(cs.age)),
                    paste(seq(from = 52, by = 2, length.out = length(cs.age))))
recoder$`52` <- list(
  V520045 = "6:hi=8",
  V520047 = "4=3;5:6=4;7=5;8:9=6;10:11=8",
  V520050 = "5=2;6=3;7:8=4;9=5;10:hi=8"
)
recoder$`56` <- list(
  V560038 = "1=7;2=1;3=2;4=3;5=4;6=5",
  V560059 = "1=7;2=1;3=2;4=3;5=4;6=5",
  V560053 = "1=7;2=1;3=2;4=3;5=4;6=5"
)
recoder$`58` <- list(
  V580021 = "8=9;7=8;6=7",
  V580027 = "8=9;7=8;6=7"
)
recoder$`60` <- list(
  V600066 = "1=7;2=1;3=2;4=3;5=4;6=5",
  V600052 = "1=7;2=1;3=2;4=3;5=4;6=5",
  V600058 = "1=7;2=1;3=2;4=3;5=4;6=5"
)
recoder$`62` <- list(
  V620055 = "1=7;2=1;3=3;4=5;5=7;6=9"
)
recoder$`64` <- list(
  V640091 = "1=7;2=1;3=2;4=3;5=4;6=5",
  V640090 = "1=7;2=1;3=2;4=3;5=4;6=5",
  V640093 = "1=7;2=1;3=2;4=3;5=4;6=5",
  V640143 = "1=7;2=1;3=2;4=3;5=4;6=5",
  V640145 = "5:hi=8"
)
recoder$`66` <- list(
  V660023 = "1=7;2=1;3=2;4=3;5=4;6=5"
)
# Recode items.
# For every common item (MapsTo), walk the study-year variables that map to
# it, copy each into a column named after the common item as numeric, and
# apply that year's manual recode when one is listed in recoder.
library(car) # provides recode(); attached once instead of on every pass.
for (item in sort(unique(cs.items[,"MapsTo"]))) {
  cat("\nCreating common responses for item",item,"...\n")
  recodes <- cs.items[cs.items$MapsTo == item,]
  for (i in seq_len(nrow(recodes))) {
    year <- paste(recodes[i,"StudyYear"])
    source.var <- recodes[i,"VariableNumber"]
    target.var <- recodes[i,"MapsTo"]
    cat(year,source.var,"\n")
    # Show the levels of this row's source variable only (not of every
    # variable that maps to the item).
    print(levels(cs.data[[year]][[source.var]]))
    # Create new variable number.
    if (source.var != target.var) {
      cat(" Creating new variable",target.var,"in",year,"\n")
    }
    # The conversion is the same whether or not the name changes; as.numeric()
    # on a factor returns its integer level codes, which is what the recode
    # specifications below operate on.
    cs.data[[year]][,target.var] <- as.numeric(cs.data[[year]][,source.var])
    # Recode if needed.
    if (source.var %in% names(recoder[[year]])) {
      recode.spec <- recoder[[year]][[source.var]]
      cat(" Recoding responses to",source.var,"as",
          recode.spec,"...\n")
      x2 <- car::recode(cs.data[[year]][,target.var],recode.spec)
      # Check to make sure this works...
      #if (source.var == "V620055") print(table(cs.data[[year]][,target.var],x2))
      cs.data[[year]][,target.var] <- x2
    }
  }
}

# Stack them all together.
# Only the merge keys, the age check variable and the common (MapsTo) item
# columns are carried into the stack. Iterating over the list elements
# directly (rather than 1:length) is also safe when cs.data is empty.
stack.names <- c("VCF0004","VCF0006","VCF0102.check",cs.items$MapsTo)
cs.stacked <- NULL
for (study.data in cs.data) {
  stack.vars <- names(study.data) %in% stack.names
  # drop=FALSE keeps a data frame even if a single column is selected.
  cs.stacked <- myFrameStacker(cs.stacked,study.data[,stack.vars,drop=FALSE])
}
cat("Finished with stacked items:\n")
# Rows of the item map whose own variable number is a column of the stack.
stacked.items <- cs.items[cs.items$VariableNumber %in% names(cs.stacked),]
print(stacked.items)

# Check on responses.
# One year-by-response table per stacked item; exclude=NULL keeps NA counts.
for (item in stacked.items[,"VariableNumber"]) {
  cat("Tabulation of",paste(cs.items[cs.items$VariableNumber == item,"ShortName"]),"by year:\n")
  print(table(cs.stacked$VCF0004,cs.stacked[,item],exclude=NULL))
}

#
# Make sure it merges to cumulative.
#
# NOTE(review): anes_cdf.RData is expected to supply anes.cdf.with.2012,
# which is used below without being created in this script -- confirm.
load(file="anes_cdf.RData")
# Merge on every column name the two frames share.
merge.vars <- intersect(names(cs.stacked),names(anes.cdf.with.2012))
cat("Found variables",paste(merge.vars,collapse=","),"to merge cumulative to cross-section...\n")
cat("Starting with dimensions",dim(anes.cdf.with.2012),"and",dim(cs.stacked),"...\n")
if (TRUE) { # diagnosing incomplete match.
  # Matching variables as string vectors (one key string per row).
  # drop=FALSE keeps a data frame for apply() even with a single merge key.
  cs.merger <- apply(cs.stacked[,merge.vars,drop=FALSE],1,toString)
  cdf.merger <- apply(anes.cdf.with.2012[,merge.vars,drop=FALSE],1,toString)
  # Explicit print(): values of expressions inside { } are not auto-printed,
  # so without it these two match summaries would be silently discarded.
  print(summary(cdf.merger[anes.cdf.with.2012$VCF0004 == 1964] %in% cs.merger))
  print(summary(cs.merger[cs.stacked$VCF0004 == 1964] %in% cdf.merger))
  # Side-by-side case counts per year in the cumulative and stacked files.
  case.counts <- as.data.frame(table(anes.cdf.with.2012$VCF0004))
  case.counts <- merge(case.counts,as.data.frame(table(cs.stacked$VCF0004)),by="Var1",suffixes=c(".cum",".cs"),all=TRUE)
  print(case.counts)
}
cat("\nNOTE: 281 extra cases from 1964 cross not in cumulative...\n")
# Left join: keep every cumulative row, drop unmatched cross-sectional rows.
anes.cdf.with.2012 <- merge(anes.cdf.with.2012,cs.stacked,by=merge.vars,all.x=TRUE,all.y=FALSE)
cat("Ending with dimensions",dim(anes.cdf.with.2012),"and",dim(cs.stacked),"...\n")

cat("Checking that age variable looks good...\n")
# Cross-tabulate cumulative-file age against the age carried over from the
# cross-sections, then drop the check column.
print(with(anes.cdf.with.2012,table(VCF0102,VCF0102.check)))
anes.cdf.with.2012$VCF0102.check <- NULL

# Create data frame describing items added.
# Variable number and short name of every mapped item whose variable number
# is now a column of the merged cumulative file.
extra.vars.50s.60s <- cs.items[
  is.element(cs.items$VariableNumber, names(anes.cdf.with.2012)),
  c("VariableNumber", "ShortName")
]

# NOTE(review): additional.2012.items is not created in this script;
# presumably it comes in with anes_cdf.RData -- confirm.
save(list = c("anes.cdf.with.2012", "additional.2012.items", "extra.vars.50s.60s"),
     file = "anes_cdf_extra_50s60s.RData")
